
##########################################################
# Set up Data for Predictive Validity Exploratory Analysis
##########################################################

# Load raw data ----
# Restores `data_interest_split1_covariates` (data split 1 for the
# predictive-validity analysis) into the workspace.
load("data_interest_split1_covariates.RData")

# Remove double ids ----
# Diagnostic only (result is not stored): list every id that occurs more than
# once. Using `> 1` instead of `== 2` also reports ids with three or more
# rows. Rows with a missing id are counted together under NA.
data_interest_split1_covariates %>%
  count(id, name = "N_response") %>%
  filter(N_response > 1)

# Keep one row per participant: the first row of each id in file order.
data_EA_cov <- data_interest_split1_covariates %>%
  filter(!is.na(id)) %>% # rows without an id are excluded
  group_by(id) %>%
  slice(1) %>%
  ungroup() # drop the grouping so later steps operate on the whole table


# Interested or Uninterested treatment ----
# Dichotomize `political_interest`: codes 3 and 4 count as interested, codes
# 1 and 2 as uninterested, anything else (including NA) becomes NA.
#   Interested    - character label
#   InterestedNum - the same split as 1 / 0, stored as a factor
data_EA_cov <- data_EA_cov %>%
  mutate(
    Interested = case_when(
      political_interest %in% c(3, 4) ~ "Interested",
      political_interest %in% c(1, 2) ~ "Uninterested",
      TRUE ~ NA
    ),
    InterestedNum = as.factor(case_when(
      political_interest %in% c(3, 4) ~ 1,
      political_interest %in% c(1, 2) ~ 0,
      TRUE ~ NA
    ))
  )


# Political Interest ----
# Give the raw item its shorter analysis name.
data_EA_cov <- rename(data_EA_cov, pol_interest = political_interest)

# Interest IDs ----
# Row-wise totals over all columns whose names start with "int_pos_id" /
# "int_neg_id" (missing items are skipped). A total of 0 is treated as
# "no data" and stored as NA; the mean is the total divided by 8.
# NOTE(review): the divisor is fixed at 8 even when some items are missing -
# confirm that each scale has 8 items and that this is intended.
data_EA_cov <- data_EA_cov %>%
  mutate(
    pos_id_sum = rowSums(
      dplyr::select(., starts_with("int_pos_id")),
      na.rm = TRUE
    ),
    pos_id_sum = replace(pos_id_sum, pos_id_sum == 0, NA),
    pos_id_mean = pos_id_sum / 8,
    neg_id_sum = rowSums(
      dplyr::select(., starts_with("int_neg_id")),
      na.rm = TRUE
    ),
    neg_id_sum = replace(neg_id_sum, neg_id_sum == 0, NA),
    neg_id_mean = neg_id_sum / 8
  )

# Age ----
# Algebraically this is Age_1 + 16.
# NOTE(review): presumably 2007 - Age_1 is the birth year and 2023 the survey
# year - confirm against the questionnaire coding.
data_EA_cov <- mutate(data_EA_cov, age = 2023 - (2007 - Age_1))

# Sex ----
# `sex`     - raw codes 1 / 2 / 3 labelled "m" / "f" / "nb"; other values NA
# `sex_fem` - numeric indicator: 1 for "f", 0 for "m", NA otherwise
#             (so "nb" respondents are NA on this indicator)
data_EA_cov <- data_EA_cov %>%
  rename(sex = Sex) %>%
  mutate(
    sex = case_when(
      sex == 1 ~ "m",
      sex == 2 ~ "f",
      sex == 3 ~ "nb",
      TRUE ~ NA
    ),
    # evaluated after `sex` has been relabelled above
    sex_fem = case_when(
      sex == "m" ~ 0,
      sex == "f" ~ 1,
      TRUE ~ NA
    )
  )


# Education ----
# `education`     - label for the raw `Education` codes 1-6 (others -> NA)
# `education_num` - five-level numeric coding of the same labels, in which
#                   "Lagere_School" shares level 1 with "VMBO"
# Both columns end up as factors.
data_EA_cov <- data_EA_cov %>%
  mutate(
    education = case_when(
      Education == 1 ~ "VMBO",
      Education == 2 ~ "HAVO_VW_Gymnasium",
      Education == 3 ~ "MBO",
      Education == 4 ~ "HBO",
      Education == 5 ~ "Universiteit",
      Education == 6 ~ "Lagere_School",
      TRUE ~ NA
    ),
    education_num = case_when(
      education %in% c("VMBO", "Lagere_School") ~ 1,
      education == "HAVO_VW_Gymnasium" ~ 2,
      education == "MBO" ~ 3,
      education == "HBO" ~ 4,
      education == "Universiteit" ~ 5,
      TRUE ~ NA
    ),
    education = as.factor(education),
    education_num = as.factor(education_num)
  )


# Ideology ----
# `ideology`: left-right self-placement (0 = far left, 10 = far right).
# `ideology_strength`: absolute distance from the scale midpoint 5, i.e. 0-5.
data_EA_cov <- data_EA_cov %>%
  rename(ideology = lr_placement_1) %>%
  mutate(ideology_strength = abs(ideology - 5))

# Confidence in knowledge ----
# Confidence in political knowledge (1 = strongly disagree, 7 = strongly
# agree after recoding).
data_EA_cov <- data_EA_cov %>%
  # Step 1a: map the raw codes 1, 8, 9, 10, 11, 12, 13 onto 1-7. Any other
  # value (including NA) has no match and becomes NA.
  mutate(across(
    c(confidence_pre_1, confidence_pre_2, confidence_pre_3),
    function(x) c(1, 2, 3, 4, 5, 6, 7)[match(x, c(1, 8, 9, 10, 11, 12, 13))]
  )) %>%
  # Step 1b: item 2 is phrased negatively, so reverse it (1 <-> 7, 2 <-> 6, ...)
  mutate(confidence_pre_2 = 8 - confidence_pre_2) %>%
  # Step 2: Renaming the variables
  rename(
    pol_confidence_1 = confidence_pre_1,
    pol_confidence_2 = confidence_pre_2,
    pol_confidence_3 = confidence_pre_3
  ) %>%
  # Step 3: Mean confidence over the items that are not missing
  mutate(
    pol_confidence = rowMeans(
      dplyr::select(., pol_confidence_1, pol_confidence_2, pol_confidence_3),
      na.rm = TRUE
    )
  )


# Political knowledge ----
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the items (each is 1 = correct, 0 = incorrect)
  rename(
    knowledge_1 = political_knowledge1,
    knowledge_2 = political_knowledge2,
    knowledge_3 = political_knowledge3,
    knowledge_4 = political_knowledge4,
    knowledge_5 = political_knowledge5
  ) %>%
  # Step 2: Mean knowledge over the non-missing items 2-5.
  # NOTE(review): knowledge_1 is not part of this mean - confirm that the
  # exclusion is intentional.
  mutate(
    pol_knowledge = rowMeans(
      dplyr::select(., knowledge_2, knowledge_3, knowledge_4, knowledge_5),
      na.rm = TRUE
    )
  )

# Political efficacy ----
data_EA_cov <- data_EA_cov %>%
  # Step 1a: map the raw codes 1, 2, 3, 4, 5, 8, 9 onto scale points 1-7.
  # Any other value (including NA) has no match and becomes NA.
  mutate(across(
    c(efficacy_1, efficacy_2, efficacy_3, efficacy_4, efficacy_5, efficacy_6),
    function(x) c(1, 2, 3, 4, 5, 6, 7)[match(x, c(1, 2, 3, 4, 5, 8, 9))]
  )) %>%
  # Step 1b: items 3-6 are reversed on top of that (1 <-> 7, 2 <-> 6, ...)
  mutate(across(
    c(efficacy_3, efficacy_4, efficacy_5, efficacy_6),
    function(x) 8 - x
  )) %>%
  # Step 2: Mean score over all six items (missing items are skipped)
  mutate(
    pol_efficacy = rowMeans(
      dplyr::select(
        ., efficacy_1, efficacy_2, efficacy_3,
        efficacy_4, efficacy_5, efficacy_6
      ),
      na.rm = TRUE
    )
  ) %>%
  # Step 3: Internal political efficacy (items 1, 2 and 6)
  mutate(
    int_pol_efficacy = rowMeans(
      dplyr::select(., efficacy_1, efficacy_2, efficacy_6),
      na.rm = TRUE
    )
  ) %>%
  # Step 4: External political efficacy (items 3, 4 and 5)
  mutate(
    ext_pol_efficacy = rowMeans(
      dplyr::select(., efficacy_3, efficacy_4, efficacy_5),
      na.rm = TRUE
    )
  )

# Political activism ----
data_EA_cov <- data_EA_cov %>%
  # Step 1: pol_alternative_1..5 -> activism_1..5 (political activism, 1-7).
  # all_of() makes the step fail loudly if one of the five columns is absent.
  rename_with(
    function(nm) sub("^pol_alternative_", "activism_", nm),
    all_of(paste0("pol_alternative_", 1:5))
  ) %>%
  # Step 2: Mean score over the items that are not missing
  mutate(
    activism = rowMeans(
      dplyr::select(., all_of(paste0("activism_", 1:5))),
      na.rm = TRUE
    )
  )


# News consumption ----
# General news consumption frequency (1 = not often at all, 7 = very often).
data_EA_cov <- rename(data_EA_cov, pol_news_con = Political_engagement_1)

# News openness ----
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the three news-avoidance items (scale 1-7)
  rename(
    news_aversion_1 = `News avoidance items_1`,
    news_aversion_2 = `News avoidance items_2`,
    news_aversion_3 = `News avoidance items_3`
  ) %>%
  # Step 2: Mean news aversion over the items that are not missing
  mutate(
    news_aversion = rowMeans(
      dplyr::select(., news_aversion_1, news_aversion_2, news_aversion_3),
      na.rm = TRUE
    )
  ) %>%
  # Step 3: Reverse each aversion item to obtain openness (1 <-> 7, ...).
  # Values outside 1-7 (including NA) have no match and become NA.
  mutate(
    news_openness_1 = c(7, 6, 5, 4, 3, 2, 1)[match(news_aversion_1, 1:7)],
    news_openness_2 = c(7, 6, 5, 4, 3, 2, 1)[match(news_aversion_2, 1:7)],
    news_openness_3 = c(7, 6, 5, 4, 3, 2, 1)[match(news_aversion_3, 1:7)]
  ) %>%
  # Step 4: Mean news openness over all three items. The earlier version
  # listed item 3 twice and never included item 2.
  mutate(
    news_openness = rowMeans(
      dplyr::select(., news_openness_1, news_openness_2, news_openness_3),
      na.rm = TRUE
    )
  )

# Personality ----
# Big Five items B5_1..B5_20 (1-5). A trailing "r" in the new name marks an
# item that is reverse-scored in Step 2.
# NOTE(review): the choice of reverse-scored items should be checked against
# Qualtrics.
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the variables
  rename(
    # extraversion
    e1 = B5_1, e2r = B5_6, e3 = B5_11, e4r = B5_16,
    # agreeableness
    a1 = B5_2, a2r = B5_7, a3 = B5_12, a4r = B5_17,
    # neuroticism
    n1 = B5_4, n2r = B5_9, n3 = B5_14, n4r = B5_19,
    # conscientiousness
    c1 = B5_3, c2r = B5_8, c3 = B5_13, c4r = B5_18,
    # openness to experiences
    o1 = B5_5, o2r = B5_10, o3r = B5_15, o4r = B5_20
  ) %>%
  # Step 2: Reverse the "r" items on the 1-5 scale (1 <-> 5, 2 <-> 4)
  mutate(across(
    c(e2r, e4r, a2r, a4r, n2r, n4r, c2r, c4r, o2r, o3r, o4r),
    function(x) 6 - x
  )) %>%
  # Step 3: Trait means over the items that are not missing
  mutate(
    e = rowMeans(dplyr::select(., e1, e2r, e3, e4r), na.rm = TRUE),
    a = rowMeans(dplyr::select(., a1, a2r, a3, a4r), na.rm = TRUE),
    n = rowMeans(dplyr::select(., n1, n2r, n3, n4r), na.rm = TRUE),
    c = rowMeans(dplyr::select(., c1, c2r, c3, c4r), na.rm = TRUE),
    o = rowMeans(dplyr::select(., o1, o2r, o3r, o4r), na.rm = TRUE)
  )

# Need for affect ----
# Items NfA_1..NfA_10 (1-5). A trailing "r" marks a reverse-scored item.
# NOTE(review): the set of reverse-scored items was inferred from Qualtrics
# and should be double-checked.
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the variables
  rename(
    nfa1r = NfA_1,
    nfa2 = NfA_2,
    nfa3 = NfA_3,
    nfa4 = NfA_4,
    nfa5r = NfA_5,
    nfa6r = NfA_6,
    nfa7r = NfA_7,
    nfa8r = NfA_8,
    nfa9 = NfA_9,
    nfa10 = NfA_10
  ) %>%
  # Step 2: Reverse the "r" items on the 1-5 scale (1 <-> 5, 2 <-> 4)
  mutate(across(
    c(nfa1r, nfa5r, nfa6r, nfa7r, nfa8r),
    function(x) 6 - x
  )) %>%
  # Step 3: Scale mean over the items that are not missing
  mutate(
    nfa = rowMeans(
      dplyr::select(
        ., nfa1r, nfa2, nfa3, nfa4, nfa5r,
        nfa6r, nfa7r, nfa8r, nfa9, nfa10
      ),
      na.rm = TRUE
    )
  )

# Need for cognition ----
# Items NfC_1..NfC_10 (1-7). A trailing "r" marks a reverse-scored item.
# NOTE(review): the set of reverse-scored items was inferred from Qualtrics
# and should be double-checked.
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the variables
  rename(
    nfc1 = NfC_1,
    nfc2 = NfC_2,
    nfc3r = NfC_3,
    nfc4r = NfC_4,
    nfc5r = NfC_5,
    nfc6r = NfC_6,
    nfc7 = NfC_7,
    nfc8 = NfC_8,
    nfc9r = NfC_9,
    nfc10 = NfC_10
  ) %>%
  # Step 2: Reverse the "r" items on the 1-7 scale (1 <-> 7, 2 <-> 6, ...)
  mutate(across(
    c(nfc3r, nfc4r, nfc5r, nfc6r, nfc9r),
    function(x) 8 - x
  )) %>%
  # Step 3: Scale mean over the items that are not missing
  mutate(
    nfc = rowMeans(
      dplyr::select(
        ., nfc1, nfc2, nfc3r, nfc4r, nfc5r,
        nfc6r, nfc7, nfc8, nfc9r, nfc10
      ),
      na.rm = TRUE
    )
  )

# Social dominance ----
# Items SDO_1..SDO_8. A trailing "r" marks a reverse-scored item.
# NOTE(review): the set of reverse-scored items was inferred from Qualtrics
# and should be double-checked.
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the variables
  rename(
    sdo1 = SDO_1,
    sdo2 = SDO_2,
    sdo3r = SDO_3,
    sdo4r = SDO_4,
    sdo5 = SDO_5,
    sdo6 = SDO_6,
    sdo7r = SDO_7,
    sdo8r = SDO_8
  ) %>%
  # Step 2: Reverse the "r" items. Each SDO item is reversed from its own
  # values; the earlier version overwrote them with need-for-affect items
  # (nfa1r, nfa5r, nfa6r, nfa7r).
  # NOTE(review): 7 - x is the exact reversal for a 1-6 response scale; if the
  # SDO items were answered on 1-7 the constant must be 8 - confirm on
  # Qualtrics.
  mutate(across(
    c(sdo3r, sdo4r, sdo7r, sdo8r),
    function(x) 7 - x
  )) %>%
  # Step 3: Scale mean over the items that are not missing
  mutate(
    sdo = rowMeans(
      dplyr::select(., sdo1, sdo2, sdo3r, sdo4r, sdo5, sdo6, sdo7r, sdo8r),
      na.rm = TRUE
    )
  )


# Dogmatism ----
# Items dogmatism_1..dogmatism_11. A trailing "r" marks a reverse-scored item.
# NOTE(review): the set of reverse-scored items was inferred from Qualtrics
# and should be double-checked; the 6 - x reversal presumes a 1-5 response
# scale.
data_EA_cov <- data_EA_cov %>%
  # Step 1: Rename the variables
  rename(
    dog1 = dogmatism_1,
    dog2r = dogmatism_2,
    dog3 = dogmatism_3,
    dog4r = dogmatism_4,
    dog5 = dogmatism_5,
    dog6r = dogmatism_6,
    dog7r = dogmatism_7,
    dog8 = dogmatism_8,
    dog9 = dogmatism_9,
    dog10r = dogmatism_10,
    dog11r = dogmatism_11
  ) %>%
  # Step 2: Reverse the "r" items (1 <-> 5, 2 <-> 4)
  mutate(across(
    c(dog2r, dog4r, dog6r, dog7r, dog10r, dog11r),
    function(x) 6 - x
  )) %>%
  # Step 3: Scale mean over the items that are not missing
  mutate(
    dogmatism = rowMeans(
      dplyr::select(
        ., dog1, dog2r, dog3, dog4r, dog5, dog6r,
        dog7r, dog8, dog9, dog10r, dog11r
      ),
      na.rm = TRUE
    )
  )

# Self-esteem ----
data_EA_cov <- data_EA_cov %>%
  # Step 1: Recode the raw `self-esteem_*` columns into new self_esteem_*
  # columns on a 1-7 scale. Values without a match (including NA) become NA.
  mutate(
    # item 1: codes 1-7 kept as they are
    self_esteem_1 = c(1, 2, 3, 4, 5, 6, 7)[match(`self-esteem_1`, 1:7)],
    # items 2-4: codes 1-7 reversed (1 <-> 7, 2 <-> 6, ...)
    self_esteem_2 = c(7, 6, 5, 4, 3, 2, 1)[match(`self-esteem_2`, 1:7)],
    self_esteem_3 = c(7, 6, 5, 4, 3, 2, 1)[match(`self-esteem_3`, 1:7)],
    self_esteem_4 = c(7, 6, 5, 4, 3, 2, 1)[match(`self-esteem_4`, 1:7)],
    # item 5: raw codes 1, 2, 3, 4, 5, 8, 9 mapped onto 1-7, not reversed.
    # NOTE(review): this item uses different raw codes than items 1-4 -
    # confirm the coding on Qualtrics.
    self_esteem_5 = c(1, 2, 3, 4, 5, 6, 7)[
      match(`self-esteem_5`, c(1, 2, 3, 4, 5, 8, 9))
    ]
  ) %>%
  # Step 2: Mean score over the items that are not missing
  mutate(
    self_esteem = rowMeans(
      dplyr::select(
        ., self_esteem_1, self_esteem_2, self_esteem_3,
        self_esteem_4, self_esteem_5
      ),
      na.rm = TRUE
    )
  )


# Standardize variables ----
# Copy of the data in which every numeric column is passed through scale()
# (centred on its mean and divided by its standard deviation); non-numeric
# columns are left as they are. scale() returns a one-column matrix, so the
# standardized columns are stored as matrices inside the data frame.
# NOTE(review): every numeric column is affected, including any numeric
# identifier or code columns - confirm that this is intended.
data_EA_cov_std <- data_EA_cov %>%
  mutate(across(where(is.numeric), scale)) %>%
  # return a plain data frame
  as.data.frame()

# Create Analysis variable ----
# Added after the standardized copy was created, so the constant column
# exists only in data_EA_cov.
data_EA_cov$Analysis <- 0

rm(data_interest_split1_covariates) # remove old data set
